# Load packages (the Excel file is read further below)
library(readxl)
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr 1.1.4 ✔ readr 2.1.5
#> ✔ forcats 1.0.0 ✔ stringr 1.5.1
#> ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
#> ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
#> ✔ purrr 1.0.2
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
#>
#> Attaching package: 'janitor'
#>
#> The following objects are masked from 'package:stats':
#>
#> chisq.test, fisher.test
library(httpgd)
library(languageserver)
library(grid)
library(ggplot2)
library(shadowtext)
library(plotly)
#>
#> Attaching package: 'plotly'
#>
#> The following object is masked from 'package:ggplot2':
#>
#> last_plot
#>
#> The following object is masked from 'package:stats':
#>
#> filter
#>
#> The following object is masked from 'package:graphics':
#>
#> layout
# Start the httpgd graphics device server used to display the plots
hgd()
#> httpgd server running at:
#> http://127.0.0.1:62328/live?token=GQtx2NqN
# NOTE(review): hard-coded, machine-specific working directory; the relative
# path passed to read_excel() below is resolved against it.
setwd(
  dir = "C:/Users/jason/OneDrive - University of Cambridge/sem-lab-code/practice-code"
)
# Read the second sheet of the workbook; the real header row is promoted to
# column names in a later step.
raw_tib <- read_excel(
  path = "../data-depo/bailey-299-driver-genes-only.xlsx",
  sheet = 2
)
#> New names:
#> • `` -> `...2`
#> • `` -> `...3`
#> • `` -> `...4`
#> • `` -> `...5`
#> • `` -> `...6`
#> • `` -> `...7`
#> • `` -> `...8`
#> • `` -> `...9`
#> • `` -> `...10`
#> • `` -> `...11`
#> • `` -> `...12`
# Use the third row of the raw sheet as the column names
raw_tib <- row_to_names(raw_tib, row_number = 3)
print(raw_tib)
#> # A tibble: 739 × 12
#> Gene Cancer KEY Tumor suppressor or …¹ Decision `Tissue Frequency`
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 ABL1 PANCAN ABL1_PANC… <NA> rescued NA
#> 2 ACVR1 UCEC ACVR1_UCEC oncogene official 5.303030303030299…
#> 3 ACVR1B PANCAN ACVR1B_PA… possible tsg official NA
#> 4 ACVR2A COADREAD ACVR2A_CO… tsg official 2.848101265822779…
#> 5 ACVR2A LIHC ACVR2A_LI… possible tsg official 3.10734463276836E…
#> 6 ACVR2A PANCAN ACVR2A_PA… possible tsg official NA
#> 7 AJUBA PANCAN AJUBA_PAN… tsg official NA
#> 8 AJUBA HNSC AJUBA_HNSC tsg official 6.374501992031869…
#> 9 AKT1 CESC AKT1_CESC oncogene official 2.554744525547450…
#> 10 AKT1 PRAD AKT1_PRAD oncogene official 6.289308176100630…
#> # ℹ 729 more rows
#> # ℹ abbreviated name: ¹​`Tumor suppressor or oncogene prediction (by 20/20+)`
#> # ℹ 6 more variables: `Pancan Frequency` <chr>, `Consensus Score` <chr>,
#> # `Correlation adusted score` <chr>, Novel <chr>, `Rescue Notes` <chr>,
#> # `Note about previous publication` <chr>
# row_to_names(row_number = 3) has already removed the header row and the
# rows above it: the tibble printed above starts at the first real record
# (ABL1 / PANCAN). Dropping rows 1:2 here would silently discard two genuine
# data rows, so the table is used as-is.
tib <- raw_tib
# Keep only the bladder cancer (BLCA) records
blca_df <- tib %>% filter(Cancer == "BLCA")
# Arrange by 'Tissue Frequency' in ascending order (backticks are required
# because the column name contains a space). The column is still character
# at this point, so sort on its numeric value; sorting the strings directly
# would order them lexicographically (e.g. "5.3E-2" after "0.4").
blca_sorted <- blca_df %>% arrange(as.numeric(`Tissue Frequency`))
print(blca_sorted)
#> # A tibble: 45 × 12
#> Gene Cancer KEY Tumor suppressor or o…¹ Decision `Tissue Frequency`
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 FAT1 BLCA FAT1_BLCA tsg official 0.116580310880829
#> 2 ERBB3 BLCA ERBB3_BLCA oncogene official 0.119170984455959
#> 3 CREBBP BLCA CREBBP_BLCA tsg official 0.126943005181346…
#> 4 ERBB2 BLCA ERBB2_BLCA oncogene official 0.129533678756477…
#> 5 SPTAN1 BLCA SPTAN1_BLCA tsg official 0.132124352331605…
#> 6 ATM BLCA ATM_BLCA possible tsg official 0.134715025906736…
#> 7 EP300 BLCA EP300_BLCA tsg official 0.137305699481865
#> 8 ELF3 BLCA ELF3_BLCA possible tsg official 0.139896373056995…
#> 9 FGFR3 BLCA FGFR3_BLCA oncogene official 0.152849740932642
#> 10 STAG2 BLCA STAG2_BLCA tsg official 0.155440414507771…
#> # ℹ 35 more rows
#> # ℹ abbreviated name: ¹​`Tumor suppressor or oncogene prediction (by 20/20+)`
#> # ℹ 6 more variables: `Pancan Frequency` <chr>, `Consensus Score` <chr>,
#> # `Correlation adusted score` <chr>, Novel <chr>, `Rescue Notes` <chr>,
#> # `Note about previous publication` <chr>
# Convert the frequency and score columns from character to numeric
blca_sorted <- blca_sorted %>%
  mutate(
    `Tissue Frequency` = as.numeric(`Tissue Frequency`),
    `Correlation adusted score` = as.numeric(`Correlation adusted score`)
  )
# Order rows by ascending tissue frequency, now compared as numbers
blca_sorted <- arrange(blca_sorted, `Tissue Frequency`)
# Fix the factor levels of Gene to the sorted row order so that plots keep
# the genes in frequency order instead of alphabetical order
blca_sorted <- blca_sorted %>%
  mutate(Gene = factor(Gene, levels = Gene))
view(blca_sorted)
# Static horizontal bar chart of tissue frequency per gene, with each bar
# labelled by its frequency formatted to two decimal places
plt_gg <- ggplot(blca_sorted, aes(x = Gene, y = `Tissue Frequency`)) +
  geom_col(fill = "blue", width = 0.6) +
  geom_text(
    aes(label = sprintf("%0.2f", round(`Tissue Frequency`, digits = 2))),
    hjust = -0.1,
    size = 2
  ) +
  # Flip the axes so genes run down the y axis and bars extend horizontally
  coord_flip() +
  theme_minimal() +
  labs(title = "Tissue Frequencies of Driver Genes in Bladder Cancer") +
  theme(
    axis.text.x = element_text(size = 5),
    # Negative right margin pulls the gene labels closer to the bars
    axis.text.y = element_text(size = 5, margin = margin(r = -20))
  )
plot(plt_gg)

# Round the plotted values for display, then assemble the HTML tooltip string
# shown for each gene (fields are separated by <br> line breaks)
blca_sorted <- blca_sorted %>%
  mutate(
    `Tissue Frequency` = round(`Tissue Frequency`, 4),
    `Correlation adusted score` = round(`Correlation adusted score`, 3),
    hover_text = paste(
      "Gene:", Gene,
      "<br>Tissue Frequency:", `Tissue Frequency`,
      "<br>Correlation Adjusted Consensus Score:", `Correlation adusted score`,
      "<br>TSG or Oncogene:", `Tumor suppressor or oncogene prediction (by 20/20+)` # nolint
    )
  )
# Interactive horizontal bar plot: bar length is the tissue frequency and bar
# colour encodes the correlation adjusted consensus score (Viridis scale).
plt <- plot_ly(blca_sorted,
  x = ~`Tissue Frequency`,
  y = ~Gene,
  type = "bar",
  orientation = "h",
  marker = list(
    color = ~`Correlation adusted score`,
    colorscale = "Viridis",
    showscale = TRUE,
    colorbar = list(
      title = list(
        text = "Correlation Adjusted Consensus Score",
        side = "right"
      )
    )
  ),
  # Use `hovertext` rather than `text`: on bar traces `text` is also drawn
  # on the bars themselves, which would print the multi-line tooltip string
  # over every bar. `hovertext` is shown only on hover.
  hovertext = ~hover_text,
  hoverinfo = "text"
) %>%
  layout(
    xaxis = list(title = "Tissue Frequency"),
    yaxis = list(
      title = "Gene",
      # Explicit tick arrays so every gene gets its own label
      tickmode = "array",
      tickvals = ~Gene,
      ticktext = ~Gene,
      tickfont = list(size = 8)
    ),
    bargap = 0.3
  )
# Display the interactive plot
plt
# Draw the static ggplot version again
plot(plt_gg)
